Code
library(tidyverse)
library(janitor)
library(ggplot2)
library(dplyr)
library(rugarch)
library(gganimate)Train 80 %-> real train 75% ,weight train 5% (weight from t-1)
test 20%
library(tidyverse)
library(janitor)
library(ggplot2)
library(dplyr)
library(rugarch)
library(gganimate)stock <- read.csv("data/individual_book_train/stock_100.csv")stock <- stock %>% mutate(
WAP = (bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1)
)
stock <- stock %>% mutate(
BidAskSpread = ask_price1 / bid_price1 - 1
)
stock <- stock %>% mutate(
imbalance = abs((bid_size1 - ask_size1) / (bid_size1 + ask_size1))
)
log_rs <- list()
imba_mean <- vector()
BAS_mean <- vector()
#time_IDs <- unique(stock$time_id)
time_IDs <- unique(stock[, 1])[1:20]
for (i in 1 : length(time_IDs)) {
sec <- stock %>% filter(time_id == time_IDs[i]) %>% pull(seconds_in_bucket)
price <- stock %>% filter(time_id == time_IDs[i]) %>% pull(WAP)
imbad <- stock %>% filter(time_id == time_IDs[i]) %>% pull(imbalance)
BASD <- stock %>% filter(time_id == time_IDs[i]) %>% pull(BidAskSpread)
imba_mean[[i]] <- mean(imbad)
BAS_mean[[i]] <- mean(BASD)
log_r <- log(price[-1] / price[1:(length(price) - 1)])
log_rs[[i]] <- data.frame(time = sec[-1], log_return = log_r)
time.no.change <- (1:600)[!(1:600 %in% log_rs[[i]]$time)]
if (length(time.no.change) > 0) {
new.df <- data.frame(time = time.no.change, log_return = 0)
log_rs[[i]] <- rbind(log_rs[[i]], new.df)
log_rs[[i]] <- log_rs[[i]][order(log_rs[[i]]$time), ]
}
}
vol <- list()
comp_vol <- function(x) {
return(sqrt(sum(x ^ 2)))
}
for (i in 1 : length(log_rs)) {
log_rs[[i]] <- log_rs[[i]] %>% mutate(time_bucket = ceiling(time / 30))
vol[[i]] <- aggregate(log_return ~ time_bucket, data = log_rs[[i]], FUN = comp_vol)
colnames(vol[[i]]) <- c('time_bucket', 'volatility')
}#put into cluster base on Plot ***Eye balling***
cluster_l <- vector()
for (i in 1:length(vol)) {
if (BAS_mean[[i]] > 0.15){cluster_l <- 4}
else if (imba_mean[[i]] > 0.61) {cluster_l[[i]] <- 3}
else if (imba_mean[[i]] < 0.45) {cluster_l[[i]] <- 2}
else {cluster_l[[i]] <- 1}
}
cluster_l [1] 3 3 2 1 3 1 2 1 2 2 1 1 1 3 2 1 3 2 2 2
model
spec <- ugarchspec(variance.model = list(model = "eGARCH", garchOrder = c(1, 1)),
mean.model = list(armaOrder = c(0, 0)),
distribution.model = "norm")
ARMA_GARCH.models <- list()
# filter time 450 for first 75% train
for (i in 1 : length(vol)) {
ARMA_GARCH.models[[i]] <- ugarchfit(spec = spec, data = log_rs[[i]] %>%
filter(time <= 450) %>% pull(log_return),
solver = 'hybrid')
}
# 30weight 120 real predict
n_w = 30
n_p = 120
garch_weight <- vector()
pred1 <- list()
pred1_adjust <- list()
for (i in 1 : length(vol)) {
fitted <- rep(1,n_p)
pred1[[i]] <- data_frame(fitted)
fitted <- rep(1:4)
pred1_adjust[[i]] <- data_frame(fitted)
fspec <- getspec(ARMA_GARCH.models[[i]])
setfixed(fspec) <- as.list(coef(ARMA_GARCH.models[[i]]))
future.path <- fitted(ugarchpath(fspec, n.sim = 150, m.sim = 1000))
future.path[is.na(future.path)] <- 0
garch_weight[i] <- mean(sqrt(colSums(future.path[1:30,]^2)))
pred1_adjust[[i]]$fitted[1] <- mean(sqrt(colSums(future.path[31:60,]^2)))
pred1_adjust[[i]]$fitted[2] <- mean(sqrt(colSums(future.path[61:90,]^2)))
pred1_adjust[[i]]$fitted[3] <- mean(sqrt(colSums(future.path[91:120,]^2)))
pred1_adjust[[i]]$fitted[4] <- mean(sqrt(colSums(future.path[121:150,]^2)))
}vol.train <- list()
vol.val <- list()
vol.w <- list()
for (i in 1 : length(log_rs)) {
vol.train[[i]] <- vol[[i]][1:15, ]
vol.val[[i]] <- vol[[i]][-(1:15), ]
}
list.reg <- list()
stocklm <- stock %>% mutate(time_bucket = ceiling(seconds_in_bucket / 30),
num_order = bid_size1 + ask_size1 + bid_size2 + ask_size2)
len.train <- length(vol.train[[1]]$volatility)
for (i in 1 : length(vol)) {
stats.bucket <- stocklm %>%
filter(time_id == time_IDs[i] & time_bucket != 0) %>%
select(c(BidAskSpread, WAP, num_order, time_bucket))
mean.price <- aggregate(WAP ~ time_bucket, data = stats.bucket, FUN = mean)
mean.order <- aggregate(num_order ~ time_bucket, data = stats.bucket, FUN = mean)
mean.BAS <- aggregate(BidAskSpread ~ time_bucket, data = stats.bucket, FUN = mean)
list.reg[[i]] <- data.frame(volatility = vol.train[[i]]$volatility[-1],
price = mean.price$WAP[1:(len.train - 1)],
order = mean.order$num_order[1:(len.train - 1)],
BidAskSpread = mean.BAS$BidAskSpread[1:(len.train - 1)])
}
lm.models <- list()
for (i in 1 : length(vol)) {
lm.models[[i]] <- lm(volatility ~ price + order + BidAskSpread, list.reg[[i]],
weights = 0.8 ^ (((len.train - 2):0) / 2))
}list.reg.val <- list()
len.val <- length(vol.val[[1]]$volatility)
pred.lm <- list()
for (i in 1 : length(vol)) {
stats.bucket <- stocklm %>%
filter(time_id == time_IDs[i] & time_bucket != 0) %>%
select(c(BidAskSpread, WAP, num_order, time_bucket))
mean.price <- aggregate(WAP ~ time_bucket, data = stats.bucket, FUN = mean)
mean.order <- aggregate(num_order ~ time_bucket, data = stats.bucket, FUN = mean)
mean.BAS <- aggregate(BidAskSpread ~ time_bucket, data = stats.bucket, FUN = mean)
list.reg.val[[i]] <-
data.frame(volatility = vol.val[[i]]$volatility,
price = mean.price$WAP[len.train:(len.train + len.val - 1)],
order = mean.order$num_order[len.train:(len.train + len.val - 1)],
BidAskSpread = mean.BAS$BidAskSpread[len.train:(len.train + len.val - 1)])
pred.lm[[i]] <- predict(lm.models[[i]], newdata = list.reg.val[[i]])
}
pred2 <- pred.lmlist.HAV <- list()
for (i in 1 : length(vol)) {
mean.vol <- rep(0, len.train - 5)
for (j in 1 : 5) {
mean.vol <- mean.vol + vol.train[[i]]$volatility[j : (j + len.train - 6)] / 5
}
list.HAV[[i]] <- data.frame(vol = vol.train[[i]]$volatility[-(1:5)],
vol_1 = vol.train[[i]]$volatility[5:(len.train - 1)],
mean_vol_5 = mean.vol)
}
quar <- list()
comp_quar <- function(x) {
return(length(x) / 3 * sum(x ^ 4))
}
for (i in 1 : length(log_rs)) {
quar[[i]] <- aggregate(log_return ~ time_bucket, data = log_rs[[i]], FUN = comp_quar)
colnames(quar[[i]]) <- c('time_bucket', 'quarticity')
}
HAV.wls.models <- list()
for (i in 1 : length(vol)) {
HAV.wls.models[[i]] <- lm(vol ~ vol_1 + mean_vol_5, list.HAV[[i]],
weights = list.HAV[[i]]$vol_1 /
sqrt(quar[[i]]$quarticity[5:(len.train - 1)]))
}pred.hav.all <- list()
for (j in 1:1) {
pred.hav <- list()
latest_obs <- list()
list_HAV1_cluster <- list()
for (i in 1:length(vol)) {
# This will predict 16, 17, 18, 19, 20
latest_obs[[i]] <- vol.train[[i]]$volatility[11:15]
for (t in 1:5) {
# Compute mean volatility for the last 5 observations
mean.vol <- sum(latest_obs[[i]])/5
# Create data frame with updated vol_1 and mean_vol_5
list_HAV1_cluster[[i]] <- data.frame(
vol_1 = latest_obs[[i]][5],
mean_vol_5 = mean.vol
)
pred.hav[[t]] <- unname(predict(HAV.wls.models[[i]], newdata = list_HAV1_cluster[[i]]))
# Drop the oldest observation and add new predicted value
latest_obs[[i]] <- c(latest_obs[[i]][-1], pred.hav[[t]])
}
#cluster_pred_lm[[j]][[i]] <- latest_obs
}
pred.hav.all[[j]] <- latest_obs
}
#pred.hav.all[[1]][[1]][[1]]
pred3 <- list()
for (i in 1:length(vol)){
pred3[[i]] <- pred.hav.all[[1]][[i]]
}cluster 1,3 = HAV + WLR
cluster 2,4 = EGARCH+ WLR
mix <- list()
for(i in 1:length(vol)){
pred_f <- rep(1,4)
mod_a <- rep(1,4)
mod_b <- rep(1,4)
alpha_w <- rep(1,4)
beta_w <- rep(1,4)
val <- rep(1,4)
time <- c(17,18,19,20)
mix[[i]] <- data.frame(time,pred_f,mod_a,mod_b,alpha_w,beta_w,val)
###val
mix[[i]]$val <- vol.val[[i]]$volatility[2:5]
if(cluster_l[[i]] == 1 | cluster_l[[i]] == 3){
mix[[i]]$mod_a <- pred3[[i]][2:5]
mix[[i]]$mod_b <- c(pred2[[i]][[2]],pred2[[i]][[3]],pred2[[i]][[4]],pred2[[i]][[5]])
pa <- garch_weight[[i]]
pb <- pred2[[i]][[1]]
}
else {
mix[[i]]$mod_a <- pred1_adjust[[i]]$fitted
mix[[i]]$mod_b <- c(pred2[[i]][[2]],pred2[[i]][[3]],pred2[[i]][[4]],pred2[[i]][[5]])
pa <- garch_weight[[i]]
pb <- pred2[[i]][[1]]
}
#16 -> 17
a = 0
b = 1
sm_err = 99999
best_a = 0
for(w in 1:11){
m_cal <- a*pa + b*pb
ab_err <- abs(m_cal - vol.val[[i]]$volatility[[1]])
if(ab_err < sm_err){
sm_err <- ab_err
best_a <- a
}
a <- a+0.1
b <- b-0.1
}
mix[[i]]$alpha_w[[1]] <- best_a
mix[[i]]$beta_w[[1]] <- round(1-best_a,digit = 1)
#17-19 -> 18-20 alpha
for(j in 1:3){
a = 0
b = 1
sm_err = 99999
best_a = 0
for(w in 1:11){
m_cal <- a*mix[[i]]$mod_a[[j]] + b*mix[[i]]$mod_b[[j]]
ab_err <- abs(m_cal - vol.val[[i]]$volatility[[j+1]])
if(ab_err < sm_err){
sm_err <- ab_err
best_a <- a
}
a <- a+0.1
b <- b-0.1
}
mix[[i]]$alpha_w[[j+1]] <- best_a
mix[[i]]$beta_w[[j+1]] <- round(1-best_a,digit = 1)
}
###mix
mix[[i]]$pred_f <- ((mix[[i]]$mod_a*mix[[i]]$alpha_w) + (mix[[i]]$mod_b*mix[[i]]$beta_w))
}value table
mix[[1]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0010260675 0.0010260675 0.0012196350 1 0 0.0015946080
2 18 0.0009863879 0.0010015944 0.0009863879 0 1 0.0007010344
3 19 0.0007984031 0.0009732926 0.0007984031 0 1 0.0007203055
4 20 0.0011301855 0.0010269748 0.0011301855 0 1 0.0008264389
[[2]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 3.861298e-05 0.0005286728 3.861298e-05 0.0 1.0 0.0006225340
2 18 5.574654e-04 0.0005574654 2.535723e-04 1.0 0.0 0.0003323998
3 19 3.434675e-04 0.0005524532 2.539023e-04 0.3 0.7 0.0001328152
4 20 1.260451e-04 0.0005043462 1.260451e-04 0.0 1.0 0.0001088957
[[3]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0011106308 0.0008619094 0.0011106308 0 1 0.0011320570
2 18 0.0009069263 0.0008653487 0.0009069263 0 1 0.0010788897
3 19 0.0010573743 0.0008720265 0.0010573743 0 1 0.0007937135
4 20 0.0008682097 0.0008682097 0.0010463319 1 0 0.0007345000
[[4]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0006207086 0.0006141766 0.0006207086 0 1 0.0004778847
2 18 0.0006178964 0.0006178964 0.0005804300 1 0 0.0005119734
3 19 0.0005614391 0.0006136615 0.0005614391 0 1 0.0007271090
4 20 0.0006155874 0.0006155874 0.0006149317 1 0 0.0001894792
[[5]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0004222089 0.0002965554 0.0004222089 0 1 0.0001584394
2 18 0.0005178913 0.0005178913 0.0003642260 1 0 0.0007691548
3 19 0.0004737087 0.0004737087 0.0004001583 1 0 0.0006672344
4 20 0.0005147503 0.0005147503 0.0003852155 1 0 0.0008105287
[[6]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0018954948 0.001254510 0.0018954948 0.0 1.0 0.0013235207
2 18 0.0012837161 0.001294379 0.0011877482 0.9 0.1 0.0009859261
3 19 0.0012316830 0.001317241 0.0012316830 0.0 1.0 0.0011174693
4 20 0.0007417947 0.001332193 0.0007417947 0.0 1.0 0.0015402672
[[7]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.002108687 0.002180509 0.002108687 0 1 0.002893711
2 18 0.002167896 0.002167896 0.001994708 1 0 0.002303492
3 19 0.002166226 0.002166226 0.002135026 1 0 0.001550287
4 20 0.002338768 0.002182540 0.002338768 0 1 0.001924196
[[8]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.001239076 0.001239076 0.001543944 1 0 0.0009875863
2 18 0.001188923 0.001188923 0.001495689 1 0 0.0012000349
3 19 0.001166031 0.001166031 0.001443581 1 0 0.0015288547
4 20 0.001388200 0.001220593 0.001388200 0 1 0.0013299081
[[9]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0006487408 0.0008120296 0.0006079186 0.2 0.8 0.0005099655
2 18 0.0006925670 0.0008076621 0.0006925670 0.0 1.0 0.0010484798
3 19 0.0008036442 0.0008036442 0.0007425936 1.0 0.0 0.0008481560
4 20 0.0008098678 0.0008098678 0.0006439829 1.0 0.0 0.0005676446
[[10]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0007456589 0.0008604422 0.0006691367 0.4 0.6 0.0004338248
2 18 0.0004130729 0.0008751826 0.0004130729 0.0 1.0 0.0006972630
3 19 0.0006782799 0.0008679354 0.0003937966 0.6 0.4 0.0004923812
4 20 0.0008804542 0.0008629057 0.0008848413 0.2 0.8 0.0006493527
[[11]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0004542000 0.0007850124 0.0004542000 0 1 0.0012770825
2 18 0.0008752055 0.0008752055 0.0010498573 1 0 0.0013026599
3 19 0.0011927043 0.0009734260 0.0011927043 0 1 0.0008389981
4 20 0.0009864821 0.0009864821 0.0007859345 1 0 0.0002842181
[[12]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.001854646 0.001854646 0.002346960 1.0 0.0 0.001265334
2 18 0.002406117 0.002406117 0.001859706 1.0 0.0 0.002095780
3 19 0.002634714 0.003425367 0.002107611 0.4 0.6 0.003082850
4 20 0.002404767 0.002400060 0.002415752 0.7 0.3 0.001890465
[[13]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.001214288 0.001214288 0.001226026 1 0 0.0013525503
2 18 0.001372639 0.001239580 0.001372639 0 1 0.0008768654
3 19 0.001215211 0.001215211 0.001270454 1 0 0.0012045310
4 20 0.001047987 0.001047987 0.001075509 1 0 0.0007419980
[[14]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0006198794 0.0008472465 0.0005630376 0.2 0.8 0.0007970520
2 18 0.0008328118 0.0008445807 0.0007857360 0.8 0.2 0.0001358754
3 19 0.0014102224 0.0008426602 0.0014102224 0.0 1.0 0.0008990069
4 20 0.0008361922 0.0008387691 0.0008129996 0.9 0.1 0.0008202798
[[15]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.001556142 0.001556142 0.001357333 1 0 0.001831960
2 18 0.001545818 0.001545818 0.001560010 1 0 0.001643141
3 19 0.001521939 0.001545572 0.001521939 0 1 0.001586889
4 20 0.001546706 0.001546706 0.001767904 1 0 0.001668429
[[16]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0006770833 0.001394837 0.0006770833 0.0 1.0 0.0013167039
2 18 0.0016030774 0.001688887 0.0008307884 0.9 0.1 0.0009321254
3 19 0.0010604730 0.001730484 0.0009860273 0.1 0.9 0.0008096176
4 20 0.0005770483 0.001635698 0.0005770483 0.0 1.0 0.0010109599
[[17]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0013945786 0.0008653706 0.001453379 0.1 0.9 0.0016703868
2 18 0.0020582878 0.0018215213 0.002058288 0.0 1.0 0.0011431227
3 19 0.0008880908 0.0008880908 0.001339176 1.0 0.0 0.0009476834
4 20 0.0018048855 0.0018334588 0.001547726 0.9 0.1 0.0021947340
[[18]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.002780547 0.002780547 0.001782293 1.0 0.0 0.003299317
2 18 0.002755109 0.002755109 0.001912485 1.0 0.0 0.001602806
3 19 0.001338368 0.002723073 0.001338368 0.0 1.0 0.002310615
4 20 0.002431167 0.002763754 0.001655130 0.7 0.3 0.001912041
[[19]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0007837318 0.0008617958 0.0007837318 0 1 0.0007625953
2 18 0.0007486862 0.0008586767 0.0007486862 0 1 0.0006761409
3 19 0.0014565198 0.0008600704 0.0014565198 0 1 0.0008144871
4 20 0.0008628539 0.0008628539 0.0004255822 1 0 0.0009530073
[[20]]
time pred_f mod_a mod_b alpha_w beta_w val
1 17 0.0013848057 0.0007870111 0.0013848057 0.0 1.0 0.0009485897
2 18 0.0007931042 0.0007846858 0.0008127469 0.7 0.3 0.0003690774
3 19 0.0007907497 0.0007907497 0.0014807308 1.0 0.0 0.0006364187
4 20 0.0007887304 0.0007887304 0.0014257837 1.0 0.0 0.0009045089
plot
all_plot <- list()
for(i in 1:length(vol)){
weight_p = ""
for(j in 1:nrow(mix[[i]])){
weight_p = paste(
weight_p,as.character(mix[[i]]$time[[j]]),"=","(",
as.character(mix[[i]]$alpha_w[[j]],1),":",as.character(mix[[i]]$beta_w[[j]],1),")"
)
}
all_plot[[i]] <- ggplot(mix[[i]], aes(x=time)) +
geom_line(aes(y = val,color = "Real Volatility"))+
geom_line(aes(y = mod_a,color = "Model a"), linetype="twodash")+
geom_line(aes(y = mod_b,color = "Model b"), linetype="twodash")+
geom_line(aes(y = pred_f,color = "Mix Model"), linetype="twodash")+
scale_color_manual(name = "Model", values = c(
"Real Volatility" = "red",
"Model a"="lightblue",
"Model b"="green",
"Mix Model" = "blue"))+
theme_classic()+
labs(
title = paste("Prediction Result"),
tag = as.character(i),
subtitle = paste("cluster ",as.character(cluster_l[[i]]),
if(cluster_l[[i]] == 1|cluster_l[[i]] == 3){mod = "HAV + WLR"}
else {mod = "EGARCH + WLR"},
"\n weight for each time interval : \n",
weight_p
),
x = "Time interval",
y = "Volatility",
caption = "each time interval = 30 seconds"
)#+
#transition_reveal(time)
}all_plot[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
[[6]]
[[7]]
[[8]]
[[9]]
[[10]]
[[11]]
[[12]]
[[13]]
[[14]]
[[15]]
[[16]]
[[17]]
[[18]]
[[19]]
[[20]]
all_plot[[5]]+transition_reveal(time)